(1) Data Cleaning & Exploration

library(tidyverse)
library(dplyr)
library(tidyr)
library(lubridate)
library(forecast)
library(tseries)
library(modeltime)
library(tidymodels)
library(prophet)
library(timetk)


setwd("~/Desktop/Data Science Projects/Oil & Gas Forecasting")

data <- read.csv("well_data.csv")

“series” dataframe

series <- data %>%
  select(period, well_name) |>
  mutate(value = 1) %>% 
  pivot_wider(names_from = well_name, values_from = value, values_fill = list(value = 0)) %>%
  arrange(period)

series |>
  slice_head(n=20)
## # A tibble: 20 × 60
##    period     FIELD11A FIELD12 FIELD13 FIELD14 FIELD15A FIELD16 FIELD17 FIELD178
##    <chr>         <dbl>   <dbl>   <dbl>   <dbl>    <dbl>   <dbl>   <dbl>    <dbl>
##  1 1980-07-01        0       0       0       0        0       0       0        0
##  2 1980-08-01        0       0       0       0        0       0       0        0
##  3 1980-09-01        0       0       0       0        0       0       0        0
##  4 1980-10-01        0       0       0       0        0       0       0        0
##  5 1980-11-01        0       0       0       0        0       0       0        0
##  6 1980-12-01        0       0       0       0        0       0       0        0
##  7 1981-02-01        0       0       0       0        0       0       0        0
##  8 1981-03-01        0       0       0       0        0       0       0        0
##  9 1981-04-01        0       0       0       0        0       0       0        0
## 10 1981-05-01        0       0       0       0        0       0       0        0
## 11 1981-06-01        0       0       0       0        0       0       0        0
## 12 1981-07-01        0       0       0       0        0       0       0        0
## 13 1981-08-01        0       0       0       0        0       0       0        0
## 14 1981-09-01        0       0       0       0        0       0       0        0
## 15 1981-10-01        0       0       0       0        0       0       0        0
## 16 1981-11-01        0       0       0       0        0       0       0        0
## 17 1981-12-01        0       0       0       0        0       0       0        0
## 18 1982-01-01        0       0       0       0        0       0       0        0
## 19 1982-02-01        0       0       0       0        0       0       0        0
## 20 1982-03-01        0       0       0       0        0       0       0        0
## # ℹ 51 more variables: FIELD19 <dbl>, FIELD198 <dbl>, FIELD1B <dbl>,
## #   FIELD20 <dbl>, FIELD21 <dbl>, FIELD211 <dbl>, FIELD216 <dbl>,
## #   FIELD221 <dbl>, FIELD223 <dbl>, FIELD282 <dbl>, FIELD31 <dbl>,
## #   FIELD32 <dbl>, FIELD34A <dbl>, FIELD35 <dbl>, FIELD37 <dbl>,
## #   FIELD39A <dbl>, FIELD4 <dbl>, FIELD43 <dbl>, FIELD5 <dbl>, FIELD51 <dbl>,
## #   FIELD51A <dbl>, FIELD52 <dbl>, FIELD53 <dbl>, FIELD53A <dbl>,
## #   FIELD55 <dbl>, FIELD55D <dbl>, FIELD56 <dbl>, FIELD57 <dbl>, …


Well Characteristics Data Frame

well_characteristics <- data %>%
  rename(date = period) |>
  group_by(well_name) %>%
  summarise(
    Avg_Gas_to_Oil_Ratio = mean(gas_total / oil, na.rm = TRUE),
    Months_of_Production = n(),
    Initial_Production_Date = min(date),
    Avg_Monthly_Gas_Decline_Rate = {
      gas_diff <- diff(gas_total)
      gas_lag <- lag(gas_total, default = first(gas_total))[-1]  
      rate <- gas_diff / gas_lag
      rate[is.infinite(rate)] <- NA 
      mean(rate, na.rm = TRUE)
    }
  ) %>%
  ungroup()

print(well_characteristics)
## # A tibble: 59 × 5
##    well_name Avg_Gas_to_Oil_Ratio Months_of_Production Initial_Production_Date
##    <chr>                    <dbl>                <int> <chr>                  
##  1 FIELD11A                  23.7                   91 1985-03-01             
##  2 FIELD12                   23.2                   87 1982-11-01             
##  3 FIELD13                   29.7                  140 1986-09-01             
##  4 FIELD14                   23.9                   80 1986-07-01             
##  5 FIELD15A                  23.4                  119 1988-02-01             
##  6 FIELD16                   23.3                    4 1987-06-01             
##  7 FIELD17                   24.3                   89 1990-04-01             
##  8 FIELD178                  22.8                   11 1986-05-01             
##  9 FIELD19                   23.5                   35 1990-08-01             
## 10 FIELD198                  23.7                   31 1986-11-01             
## # ℹ 49 more rows
## # ℹ 1 more variable: Avg_Monthly_Gas_Decline_Rate <dbl>


(2) Forecasting

Filtering out Wells < 24 months and data with zero production

wells_ge_24 <- well_characteristics |>
  filter(Months_of_Production >= 24)

cleaned_data <- data |>
  filter(well_name %in% wells_ge_24$well_name & oil != 0 & gas_total != 0) |>
  rename(date = period)


Data Cleaning & Exploration

Combine data for all Wells into a single aggregated dataframe

data_aggregated <- cleaned_data %>%
  mutate(date = as.Date(date, format = "%Y-%m-%d")) |>
  group_by(date) %>%
  summarise(total_oil = sum(oil, na.rm = TRUE),
            total_gas = sum(gas_total, na.rm = TRUE)) |>
  arrange(date)

head(data_aggregated)
## # A tibble: 6 × 3
##   date       total_oil total_gas
##   <date>         <dbl>     <dbl>
## 1 1980-07-01     2136.    51426.
## 2 1980-08-01     8435.   228568.
## 3 1980-09-01    11560.   312667.
## 4 1980-10-01     1075.    10971.
## 5 1980-11-01     1954.    36393.
## 6 1980-12-01     7270.   122097.


Plot the full time series to observe the behavior of the data.

ggplot(data_aggregated, aes(x = date, y = total_oil)) +
  geom_line(color = "blue") + 
  labs(title = "Oil Production Over Time",
       x = "Month",
       y = "barrels/d") +
  theme_minimal()


The data appears to display high volatility between 1980 and 2010 with the trend showing stabilization after 2010. Due to this, I made the decision to remove the data prior to 2010 and focus on 2010-2018 data for forecasting. Given that the goal is to forecast the next 6 months, the last 8 years of data seems to be more applicable.

2010 - present Data

data_subset <- data_aggregated |>
  filter(date >= as.Date("2010-01-01"))


data_subset_oil <- data_subset |>
  select(date, total_oil)

data_subset_oil |> plot_time_series(date, total_oil)


Looking at the plot above, we can see that there may be an outlier between 2015 and 2016. I’ll go ahead and include a step for handling outliers.

Handling Outliers

Q1 <- quantile(data_subset_oil$total_oil, 0.25)
Q3 <- quantile(data_subset_oil$total_oil, 0.75)
IQR <- Q3 - Q1

lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

outliers <- which(data_subset_oil$total_oil < lower_bound | data_subset_oil$total_oil > upper_bound)

median_value <- median(data_subset_oil$total_oil, na.rm = TRUE)
data_subset_oil$total_oil[outliers] <- median_value


Let’s take a look at the data after handling outliers

data_subset_oil |> plot_time_series(date, total_oil)


Test for Stationarity

I’ll use the ADF test to test for stationarity

strtyr <- min(year(data_subset_oil$date))

ts_data <- ts(data_subset_oil$total_oil,
              frequency = 12,
              start = c(strtyr, 1))

adf_oil <- adf.test(ts_data, alternative = "stationary")
print(adf_oil)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_data
## Dickey-Fuller = -1.7646, Lag order = 4, p-value = 0.6743
## alternative hypothesis: stationary


With a p-value of 0.6743, the results show no-stationarity. I’ll try first differencing.

Differencing

diff_ts_data <- diff(ts_data) 
adf_test_diff <- adf.test(diff_ts_data, alternative = "stationary")
print(adf_test_diff)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff_ts_data
## Dickey-Fuller = -4.8192, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

P-value is now below 0.05, showing stationarity after differencing.


I’ll now go ahead and split the data into training and test sets and observe the performance of the models auto_arima, prophet, prophet_xgboost, glmnet, and auto_arima_boost.

Model Testing

Train/Test splits

splits <- time_series_split(
  data_subset_oil,
  assess     = "12 months",
  cumulative = TRUE
)

splits %>%
  tk_time_series_cv_plan() %>%
  plot_time_series_cv_plan(date, total_oil)


Model Fitting

# FORECAST ----

# * AUTO ARIMA ----
model_arima <- arima_reg() %>%
  set_engine("auto_arima") %>%
  fit(total_oil ~ date, training(splits))

# * Prophet ----
model_prophet <- prophet_reg(
  seasonality_yearly = FALSE,
) %>%
  set_engine("prophet") %>%
  fit(total_oil ~ date, training(splits))

# * Prophet XGBoost----
model_prophet_xg <- prophet_boost(
  seasonality_yearly = FALSE
) %>%
  set_engine("prophet_xgboost") %>%
  fit(total_oil ~ date + as.numeric(date) + month(date, label = TRUE),
      training(splits))

# * Machine Learning - GLM ----
library(glmnet)
model_glmnet <- linear_reg(penalty = 0.1
                           ,mixture = 0.5) %>%
  set_engine("glmnet") %>%
  fit(
    total_oil ~ month(date, label = TRUE)
    + as.numeric(date),
    training(splits)
  )

# Auto ARIMA with XGBoost
model_arima_boost <- arima_boost() |>
  set_engine("auto_arima_xgboost") |>
  fit(total_oil ~ date + as.numeric(date) + month(date, label = TRUE),
      training(splits))


Model Comparison

model_tbl <- modeltime_table(
  model_arima,
  model_prophet,
  model_prophet_xg,
  model_glmnet,
  model_arima_boost
)


Calculate the accuracy of the models on the training set and the test set.

calib_tbl_train <- model_tbl %>%
  modeltime_calibrate(training(splits))

calib_tbl <- model_tbl %>%
  modeltime_calibrate(testing(splits))

calib_tbl_train %>% modeltime_accuracy()
## # A tibble: 5 × 9
##   .model_id .model_desc                .type   mae  mape  mase smape  rmse   rsq
##       <int> <chr>                      <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1         1 ARIMA(0,1,1)               Fitt…  908.  8.77 0.964  8.64 1212. 0.822
## 2         2 PROPHET                    Fitt… 2111. 20.1  2.24  18.8  2454. 0.258
## 3         3 PROPHET W/ XGBOOST ERRORS  Fitt…  135.  1.38 0.143  1.36  182. 0.996
## 4         4 GLMNET                     Test  2093. 20.0  2.22  18.7  2441. 0.266
## 5         5 ARIMA(0,1,1) W/ XGBOOST E… Fitt…  266.  2.62 0.283  2.59  375. 0.983
calib_tbl %>% modeltime_accuracy()
## # A tibble: 5 × 9
##   .model_id .model_desc              .type   mae  mape  mase smape  rmse     rsq
##       <int> <chr>                    <chr> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1         1 ARIMA(0,1,1)             Test   693.  6.11  1.57  6.39  874. NA     
## 2         2 PROPHET                  Test  2337. 21.1   5.30 23.9  2487.  0.685 
## 3         3 PROPHET W/ XGBOOST ERRO… Test  1177. 10.4   2.67 11.2  1522.  0.839 
## 4         4 GLMNET                   Test  2335. 21.0   5.30 23.9  2518.  0.417 
## 5         5 ARIMA(0,1,1) W/ XGBOOST… Test   714.  6.36  1.62  6.61  933.  0.0503

Based on the summaries of the models between both the training set and the test set, the model ARIMA (0,1,1) with XGBoost seems to be performing the best out of the 4 other models after reviewing the rmse and mae values between both the training set and the test set.

Plot Residuals

Plot the residuals of the training set

residuals_timeplot_train <- model_tbl |>
  modeltime_calibrate(training(splits)) |>
  modeltime_residuals() |>
  plot_modeltime_residuals(
    .type = "timeplot",
    .interactive = TRUE
  )

residuals_timeplot_train

Looking at the residuals, the models Prophet with XGBoost, and ARIMA(0,1,1) with XGBoost seem to be capturing the overall trend and variability in the data reasonably well.


Plot ACF

Plot the ACF of the residuals for the training set

residuals_acf_train <- model_tbl |>
  modeltime_calibrate(training(splits)) |>
  modeltime_residuals() |>
  plot_modeltime_residuals(
    .type = "acf",
    .interactive = TRUE
  )

residuals_acf_train

ARIMA(0,1,1), Prophet with XBoost, and ARIMA(0,1,1) with XGBoost show no significant auto-correlation, although ARIMA (0,1,1) with XGBoost crosses the bounds of the confidence interval slighlty more than ARIMA (0,1,1) and Prophet with XGBoost.

Forecast on Original data

Display the forecast for the next 6 months using the 5 models that were tested, for comparison.

future_forecast_tbl <- calib_tbl %>%
  modeltime_refit(data_subset_oil) %>%
  modeltime_forecast(
    h           = "6 months",
    actual_data = data_subset_oil
  ) %>%
  plot_modeltime_forecast()

future_forecast_tbl


In summary, I started my exploratory analysis by evaluating the long term trend of the oil production data and the benefit of having the full data during forecasting. I concluded that the data from 2010 and onwards seemed more suitable and applicable for forecasting the next 6 months.

Outliers were observed during the period between 2015 and 2016 that could’ve affected the modeling results moving forward. I went ahead and addressed the outliers using the IQR method for outlier detection.

In the forecasting phase, I utilized the following models ARIMA, ARIMA with XGBoost, Prophet, Prophet with XGBoost, and Elastic Net (GLMNET). Based on the results of the models and the distribution of the residuals, it appears that ARIMA with XGBoost performs the best based on the rmse and mae along with the residual plots of both the training and the test sets.

(3) Visualization

insert_well_forecast <- function(well) {
  
  df <- cleaned_data |> 
    filter(well_name == well) |>
    mutate(Months_of_Production = row_number()) |>
    rename(total_oil = oil) |>
    mutate(date = as.Date(date, format = "%Y-%m-%d"))

  ### Handling Outliers
  Q1 <- quantile(df$total_oil, 0.25)
  Q3 <- quantile(df$total_oil, 0.75)
  IQR <- Q3 - Q1

  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR

  outliers <- which(df$total_oil < lower_bound | df$total_oil > upper_bound)

  median_value <- median(df$total_oil, na.rm = TRUE)
  df$total_oil[outliers] <- median_value
  
  split_index <- round(max(df$Months_of_Production) * 0.2)
  
  df <- df |>
    select(date, total_oil, well_name)
  
  
  splits <- time_series_split(
    df,
    assess     = split_index,
    cumulative = TRUE
  )
  
  # FORECAST ----
  # * Prophet ----
  model_prophet <- prophet_reg(
    seasonality_yearly = FALSE
  ) %>%
    set_engine("prophet") %>%
    fit(total_oil ~ date, training(splits))
  
  # * Prophet XGBoost----
  model_prophet_xg <- prophet_boost(
    seasonality_yearly = FALSE
  ) %>%
    set_engine("prophet_xgboost") %>%
    fit(total_oil ~ date + as.numeric(date) + month(date, label = TRUE),
        training(splits))
  
  # * AUTO ARIMA ----
  model_arima <- arima_reg() %>%
    set_engine("auto_arima") %>%
    fit(total_oil ~ date, training(splits))
  
  # STLM ETS
  model_arima_boost <- arima_boost() |>
    set_engine("auto_arima_xgboost") |>
    fit(total_oil ~ date + as.numeric(date) + month(date, label = TRUE),
        training(splits))
  
  model_arima_boost
  
  
  # MODELTIME COMPARE ----
  
  # * Modeltime Table ----
  model_tbl <- modeltime_table(
    model_arima_boost
  )
  
  
  # * Calibrate ----
  calib_tbl <- model_tbl %>%
    modeltime_calibrate(testing(splits))
  
  # * Accuracy ----
  model_accuracy <- calib_tbl %>% modeltime_accuracy()
  model_accuracy
  
  future_forecast_tbl <- calib_tbl %>%
    modeltime_refit(df) %>%
    modeltime_forecast(
      h           = "6 months",
      actual_data = df
    )
  
  plot_data <- future_forecast_tbl |>
    select(.model_desc, .key, .index, .value, .conf_lo, .conf_hi) |>
    rename(model = .model_desc,
           key = .key,
           date = .index,
           value = .value,
           conf_lo = .conf_lo,
           conf_hi = .conf_hi) |>
    mutate(Months_of_Production = row_number())
  
  
  display_forecast <- ggplot(plot_data, aes(x = Months_of_Production, y = value, color = key)) +
                        geom_line() +
                        geom_ribbon(aes(ymin = conf_lo, ymax = conf_hi), fill = "blue", alpha = 0.2) +
                        labs(title = paste0("Projected Oil Production Growth (",well,") from",
                                            year(min(plot_data$date)),"-", year(max(plot_data$date))), x = "Date", y = "Value") +
                        scale_x_continuous(breaks = seq(1, max(plot_data$Months_of_Production), by = 12)) +
                        theme_minimal()+
                        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(y = "Barrels/d",
         x = "Months")
  
  display_forecast
  
  
  return(display_forecast)
  
}
forecasted_viz_11A <- insert_well_forecast("FIELD11A")
forecasted_viz_58 <- insert_well_forecast("FIELD58")


forecasted_viz_11A

forecasted_viz_58